%matplotlib inline
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from math import pi
# Load the player dataset from CSV into a DataFrame.
data_path = "data.csv"
df = pd.read_csv(data_path)
# Quick inspection: column names, first rows, summary statistics,
# dtypes / non-null counts, and the number of missing values per column.
df.columns
df.head()
df.describe()
df.info()
df.isna().sum()
There are a lot of null values in the data.
def weight_correction(df):
    """Convert a raw weight cell into a number.

    The last three characters of the input are dropped before conversion
    (e.g. ``"159lbs"`` -> ``159.0``).

    Args:
        df: A single cell value from the ``Weight`` column (despite the
            parameter name, this is not a DataFrame).

    Returns:
        The weight as a float, or ``0`` when the value is missing or cannot
        be parsed.
    """
    try:
        return float(df[:-3])
    except (TypeError, ValueError):
        # TypeError: the value cannot be sliced/converted (e.g. a float NaN).
        # ValueError: the remaining text is not a number.
        return 0
# Drop the unit suffix from every weight and convert it to a number.
df['Weight'] = df.Weight.apply(weight_correction)
df.Weight = pd.to_numeric(df.Weight)
# weight_correction returns 0 for unparseable cells; mark those as missing.
df.Weight = df.Weight.replace(0, np.nan)
def value_to_int(df_value):
    """Convert a money string such as ``"€110.5M"`` into a plain number.

    The first character is treated as a currency symbol and skipped. A
    trailing ``M`` multiplies the amount by 1,000,000 and a trailing ``K``
    by 1,000; a value without either suffix is parsed as-is.

    Args:
        df_value: A single cell value from a monetary column.

    Returns:
        The amount as a float, or ``0`` when the cell is missing or cannot
        be parsed.
    """
    multipliers = {'M': 1000000, 'K': 1000}
    try:
        suffix = df_value[-1:]
        if suffix in multipliers:
            return float(df_value[1:-1]) * multipliers[suffix]
        # No magnitude suffix: keep every digit after the currency symbol.
        return float(df_value[1:])
    except (TypeError, ValueError):
        # TypeError: non-string cell (e.g. float NaN); ValueError: not numeric.
        return 0
# Turn the formatted money strings into numbers.
df['Value'] = df['Value'].apply(value_to_int)
df['Wage'] = df['Wage'].apply(value_to_int)
# A result of 0 is treated as "no data" and marked as missing.
df.Value = df.Value.replace(0, np.nan)
df.Wage = df.Wage.replace(0, np.nan)
# How many weights are missing, and what is the mean of the observed ones?
df.Weight.isna().sum()
df.Weight.mean()
According to livestrong data,
# Replace missing weights with the mean of the observed weights. The result is
# assigned back to the column: calling an in-place method on a column
# selection is not guaranteed to modify ``df`` itself.
df['Weight'] = df['Weight'].fillna(df.Weight.mean())
# Count missing heights and plot how often each recorded height occurs.
df.Height.isna().sum()
plt.figure(figsize = (20, 10))
sns.countplot(x='Height', data=df)
plt.show()
According to livestrong data,
# Fill missing heights with a fixed default. Assign the result back instead of
# using inplace=True on a column selection, which may operate on a copy.
df['Height'] = df['Height'].fillna("5'11")
# The sections below share one recipe: build a boolean mask of the missing
# entries, compute the relative frequency of each observed value, then fill
# the missing entries with random draws weighted by those frequencies.

# --- Weak Foot ---
wf_missing = df['Weak Foot'].isna()
wf_missing.sum()
weak_foot_prob = df['Weak Foot'].value_counts(normalize=True)
weak_foot_prob
df.loc[wf_missing,'Weak Foot'] = np.random.choice(weak_foot_prob.index, size=wf_missing.sum(),p=weak_foot_prob.values)
# --- Preferred Foot ---
pf_missing = df['Preferred Foot'].isna()
pf_missing.sum()
df['Preferred Foot'].value_counts()
foot_distribution = df['Preferred Foot'].value_counts(normalize=True)
foot_distribution
df.loc[pf_missing, 'Preferred Foot'] = np.random.choice(foot_distribution.index, size = pf_missing.sum(), p=foot_distribution.values)
# Counts after imputation, for comparison with the counts above.
df['Preferred Foot'].value_counts()
# --- Position ---
fp_missing = df.Position.isna()
fp_missing.sum()
position_prob = df.Position.value_counts(normalize=True)
position_prob
plt.figure(figsize = (20, 10))
sns.countplot(x=df.Position, data=df)
plt.show()
df.loc[fp_missing, 'Position'] = np.random.choice(position_prob.index, p=position_prob.values, size=fp_missing.sum())
# --- Skill Moves ---
fs_missing = df['Skill Moves'].isna()
fs_missing.sum()
skill_moves_prob = df['Skill Moves'].value_counts(normalize=True)
skill_moves_prob
df.loc[fs_missing, 'Skill Moves'] = np.random.choice(skill_moves_prob.index, p=skill_moves_prob.values, size=fs_missing.sum())
# --- Body Type ---
bt_missing = df['Body Type'].isna()
bt_missing.sum()
bt_prob = df['Body Type'].value_counts(normalize=True)
bt_prob
# Unlike the other sections, the draw uses two hard-coded categories with
# fixed probabilities rather than the full bt_prob distribution.
df.loc[bt_missing, 'Body Type'] = np.random.choice(['Normal', 'Lean'], p=[.63,.37], size=bt_missing.sum())
# --- Wage ---
wage_missing = df.Wage.isna()
wage_missing.sum()
wage_prob = df.Wage.value_counts(normalize=True)
wage_prob
df.loc[wage_missing, 'Wage'] = np.random.choice(wage_prob.index, p=wage_prob.values, size=wage_missing.sum())
# Fill the remaining gaps in float columns with the column mean. The filled
# column is assigned back: inplace=True on a column selection may act on a
# temporary copy and leave ``df`` untouched.
for feature in df.columns:
    if df[feature].dtype == 'float64':
        df[feature] = df[feature].fillna(df[feature].mean())

# For each of these columns, every missing entry receives the same value,
# drawn at random from the column. The draw is restricted to observed
# (non-null) entries so that NaN itself can never be picked as the fill value.
for column in ['Contract Valid Until', 'Loaned From', 'Joined',
               'Jersey Number', 'Club', 'Work Rate',
               'International Reputation']:
    observed = df[column].dropna()
    if not observed.empty:
        df[column] = df[column].fillna(np.random.choice(observed.to_numpy()))

# Anything still missing becomes 0.
df.fillna(0, inplace = True)
def defending(data):
    """Return the mean of the defending attributes selected from ``data``."""
    columns = ['Marking', 'StandingTackle', 'SlidingTackle']
    averaged = data[columns].mean()
    return averaged.mean()
def general(data):
    """Return the mean of the general ball-skill attributes in ``data``."""
    columns = ['HeadingAccuracy', 'Dribbling', 'Curve', 'BallControl']
    averaged = data[columns].mean()
    return averaged.mean()
def mental(data):
    """Return the mean of the mental attributes selected from ``data``."""
    columns = ['Aggression', 'Interceptions', 'Positioning',
               'Vision', 'Composure']
    averaged = data[columns].mean()
    return averaged.mean()
def passing(data):
    """Return the mean of the passing attributes selected from ``data``."""
    columns = ['Crossing', 'ShortPassing', 'LongPassing']
    averaged = data[columns].mean()
    return averaged.mean()
def mobility(data):
    """Return the mean of the mobility attributes selected from ``data``."""
    columns = ['Acceleration', 'SprintSpeed', 'Agility', 'Reactions']
    averaged = data[columns].mean()
    return averaged.mean()
def power(data):
    """Return the mean of the power attributes selected from ``data``."""
    columns = ['Balance', 'Jumping', 'Stamina', 'Strength']
    averaged = data[columns].mean()
    return averaged.mean()
def rating(data):
    """Return the mean of ``Potential`` and ``Overall`` taken from ``data``."""
    columns = ['Potential', 'Overall']
    averaged = data[columns].mean()
    return averaged.mean()
def shooting(data):
    """Return the mean of the shooting attributes selected from ``data``."""
    columns = ['Finishing', 'Volleys', 'FKAccuracy',
               'ShotPower', 'LongShots', 'Penalties']
    averaged = data[columns].mean()
    return averaged.mean()
# Rename the logo column so its name contains no space.
df.rename(columns={'Club Logo':'Club_Logo'}, inplace=True)
# Add one aggregated score per category; each helper is applied row by row
# (axis=1) and averages the attribute columns belonging to that category.
df['Defending'] = df.apply(defending, axis = 1)
df['General'] = df.apply(general, axis = 1)
df['Mental'] = df.apply(mental, axis = 1)
df['Passing'] = df.apply(passing, axis = 1)
df['Mobility'] = df.apply(mobility, axis = 1)
df['Power'] = df.apply(power, axis = 1)
df['Rating'] = df.apply(rating, axis = 1)
df['Shooting'] = df.apply(shooting, axis = 1)
# Compact per-player frame: name, the eight category scores, and the
# descriptive columns (flag, age, nationality, photo, club logo, club).
players = df[['Name','Defending','General','Mental','Passing',
              'Mobility','Power','Rating','Shooting','Flag','Age',
              'Nationality', 'Photo', 'Club_Logo', 'Club']]
# Bar chart of player counts per position, most common position first.
plt.figure(figsize = (20, 10))
ax = sns.countplot(x='Position', data=df, order = df['Position'].value_counts().index)
ax.set_title(label = 'Number of footballers available in each position', fontsize = 20)
plt.show()
# Attributes considered when profiling each position.
player_features = (
    'Acceleration', 'Aggression', 'Agility',
    'Balance', 'BallControl', 'Composure',
    'Crossing', 'Dribbling', 'FKAccuracy',
    'Finishing', 'GKDiving', 'GKHandling',
    'GKKicking', 'GKPositioning', 'GKReflexes',
    'HeadingAccuracy', 'Interceptions', 'Jumping',
    'LongPassing', 'LongShots', 'Marking', 'Penalties'
)
from math import pi

# One radar (spider) chart per position showing its five highest mean
# attributes, laid out on a 10 x 3 grid of polar subplots.
idx = 1
plt.figure(figsize=(15,45))
# Column selection on a groupby must be a list: pandas >= 2.0 rejects a tuple
# with more than one element.
position_means = df.groupby('Position')[list(player_features)].mean()
for position_name, features in position_means.iterrows():
    top_features = dict(features.nlargest(5))

    # Axis labels and number of axes.
    categories = list(top_features.keys())
    N = len(categories)

    # Repeat the first value so the plotted polygon closes on itself.
    values = list(top_features.values())
    values += values[:1]

    # Evenly spaced angle for each axis, again closed with the first one.
    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]

    # Initialise the spider plot.
    ax = plt.subplot(10, 3, idx, polar=True)

    # One axis per attribute, labelled with the attribute name.
    plt.xticks(angles[:-1], categories, color='grey', size=8)

    # Radial labels and range.
    ax.set_rlabel_position(0)
    plt.yticks([25,50,75], ["25","50","75"], color="grey", size=7)
    plt.ylim(0,100)

    # Outline and filled area.
    ax.plot(angles, values, linewidth=1, linestyle='solid')
    ax.fill(angles, values, 'b', alpha=0.1)

    plt.title(position_name, size=11, y=1.1)
    idx += 1

# Vertical spacing applies to the whole figure, so set it once after the loop.
plt.subplots_adjust(hspace = 0.5)
# Histogram of the 'Special' score across all players.
sns.set(style = 'dark', palette = 'colorblind', color_codes = True)
x = df.Special
plt.figure(figsize = (12, 8))
# NOTE(review): distplot is reported as deprecated in newer seaborn releases
# (histplot is the suggested successor) - confirm against the pinned version.
ax = sns.distplot(x, bins = 50, kde = False, color = 'm')
ax.set_xlabel(xlabel = 'Special score range', fontsize = 16)
ax.set_ylabel(ylabel = 'Count of the Players',fontsize = 16)
ax.set_title(label = 'Histogram for the Speciality Scores of the Players', fontsize = 20)
plt.show()
# Relationship between the 'Special' score and wage.
sns.scatterplot(x = 'Special', y='Wage', data=df)
plt.rcParams['figure.figsize'] = (20, 10)
# Players with a 'Skill Moves' value of 5, counted for the five nationalities
# that have the most of them.
skill_df = df[df['Skill Moves'] == 5][['Name','Nationality']]
sns.countplot(x='Nationality', data=skill_df, order=skill_df.Nationality.value_counts().iloc[:5].index)
import squarify
# The five nationalities with the most players overall.
df.Nationality.value_counts().nlargest(5).plot(kind='bar')
countries = df.Nationality.value_counts().nlargest(5).index
data_countries = df[df['Nationality'].isin(countries)]
# Weight distribution per nationality for those five countries.
plt.rcParams['figure.figsize'] = (12, 7)
ax = sns.violinplot(x = data_countries['Nationality'], y = data_countries['Weight'], palette = 'colorblind')
ax.set_xlabel(xlabel = 'Countries', fontsize = 9)
ax.set_ylabel(ylabel = 'Weight in lbs', fontsize = 9)
ax.set_title(label = 'Distribution of Weight of players from different countries', fontsize = 20)
import matplotlib.image as mpimg
import requests
def print_club_flag(clubs):
    """Download and display the logo of each club, side by side.

    The logo URL is taken from the first row of the global ``df`` whose
    ``Club`` matches. Each image is written to ``img_club_logo.jpg`` in the
    working directory (overwritten on every iteration) and then read back.

    Args:
        clubs: Iterable of club names. The figure is a single row with six
            slots, so at most six clubs can be drawn.

    Raises:
        requests.RequestException: if a logo cannot be downloaded.
    """
    logo_image = "img_club_logo.jpg"
    fig = plt.figure(figsize=(10,10))
    for index, club in enumerate(clubs):
        logo = df[df['Club'] == club]['Club_Logo'].iloc[0]
        response = requests.get(logo, timeout=10)
        # Fail with a clear HTTP error here instead of a confusing decode
        # error later when a non-image error page is read as an image.
        response.raise_for_status()
        with open(logo_image, 'wb') as handler:
            handler.write(response.content)
        img = mpimg.imread(logo_image)
        ax = fig.add_subplot(1, 6, index+1, xticks=[], yticks=[])
        ax.imshow(img, interpolation="lanczos")
        ax.set_title("%d. %s" %(index+1, club))
    # Lay out once, after every axes (including its title) exists.
    fig.tight_layout()
def print_national_flag(nations):
    """Download and display the flag of each nation, side by side.

    The flag URL is taken from the first row of the global ``df`` whose
    ``Nationality`` matches. Each image is written to ``img_nation_logo.jpg``
    in the working directory (overwritten on every iteration) and read back.

    Args:
        nations: Iterable of nationality names. The figure is a single row
            with six slots, so at most six nations can be drawn.

    Raises:
        requests.RequestException: if a flag cannot be downloaded.
    """
    logo_image = "img_nation_logo.jpg"
    fig = plt.figure(figsize=(10, 10))
    for index, nation in enumerate(nations):
        logo = df[df['Nationality'] == nation]['Flag'].iloc[0]
        response = requests.get(logo, timeout=10)
        # Fail with a clear HTTP error here instead of a confusing decode
        # error later when a non-image error page is read as an image.
        response.raise_for_status()
        with open(logo_image, 'wb') as handler:
            handler.write(response.content)
        img = mpimg.imread(logo_image)
        ax = fig.add_subplot(1, 6, index+1, xticks=[], yticks=[])
        ax.imshow(img, interpolation="lanczos")
        ax.set_title("%d. %s" %(index+1, nation))
    # Lay out once, after every axes (including its title) exists.
    fig.tight_layout()
# --- Clubs ---
# Top five clubs by mean 'Overall' rating of their players.
d = {'Overall': 'Average_Rating'}
best_overall_club_df = df.groupby('Club').agg({'Overall':'mean'}).rename(columns=d)
clubs = best_overall_club_df.Average_Rating.nlargest(5).index
clubs_list = []
print_club_flag(clubs)
# Top five clubs by attack: Shooting + Power + Passing summed over all of a
# club's players.
# NOTE(review): these are sums, not means, so squad size affects the ranking.
attck_list = ['Shooting', 'Power', 'Passing']
best_attack_df = players.groupby('Club')[attck_list].sum().sum(axis=1)
clubs = best_attack_df.nlargest(5).index
print_club_flag(clubs)
# Top five clubs by summed Defending score.
best_defense_df = players.groupby('Club')['Defending'].sum()
clubs = best_defense_df.nlargest(5).index
print_club_flag(clubs)
# --- Nations ---
# Top five nationalities by mean 'Overall' rating.
d = {'Overall': 'Average_Rating'}
best_overall_country_df = df.groupby('Nationality').agg({'Overall':'mean'}).rename(columns=d)
nations = best_overall_country_df.Average_Rating.nlargest(5).index
print_national_flag(nations)
# The three highest 'Overall' values among United Arab Emirates players, then
# the names of the UAE players whose rating is one of those values.
best_3_uae = df[df['Nationality'] == 'United Arab Emirates']['Overall'].nlargest(3)
print(best_3_uae)
uae_df = df[df['Nationality'] == 'United Arab Emirates']
uae_df[uae_df['Overall'].isin(best_3_uae)]['Name']
# Top five nationalities by summed attack scores.
best_attack_nation_df = players.groupby('Nationality')[attck_list].sum().sum(axis=1)
nations = best_attack_nation_df.nlargest(5).index
print_national_flag(nations)
# Top five nationalities by summed Defending score.
best_defense_nation_df = players.groupby('Nationality')['Defending'].sum()
nations = best_defense_nation_df.nlargest(5).index
print_national_flag(nations)
import requests
import random
from math import pi
import matplotlib.image as mpimg
from matplotlib.offsetbox import (OffsetImage,AnnotationBbox)
def details(row, title, image, age, nationality, photo, logo, club):
    """Draw a player "ID card".

    The card is a radar chart of the player's category scores from the global
    ``players`` frame, drawn over the national flag, with the player photo,
    the club logo and three text annotations (nationality, age, team).

    The three images are downloaded and written to ``img_flag.jpg``,
    ``img_player.jpg`` and ``img_club_logo.jpg`` in the working directory,
    and a new matplotlib figure is created.

    Args:
        row: Index label of the player in ``players``.
        title: Figure title.
        image: URL of the flag image used as background.
        age: Player age, shown in an annotation.
        nationality: Nationality name, shown upper-cased.
        photo: URL of the player photo.
        logo: URL of the club logo.
        club: Club name, shown upper-cased.

    Raises:
        requests.RequestException: if any image cannot be downloaded.
    """
    flag_image = "img_flag.jpg"
    player_image = "img_player.jpg"
    logo_image = "img_club_logo.jpg"
    _download_image(image, flag_image)
    _download_image(photo, player_image)
    _download_image(logo, logo_image)

    # Random fill colour. The format string emits upper-case hex digits, so
    # the "pure white" check must compare against upper case as well.
    colorRandom = '#%02X%02X%02X' % tuple(random.randint(0, 255) for _ in range(3))
    if colorRandom == '#FFFFFF':
        colorRandom = '#a5d6a7'
    basic_color = '#37474f'
    color_annotate = '#01579b'

    img = mpimg.imread(flag_image)
    plt.figure(figsize=(15,8))

    # Only the score columns go on the radar; select them by name so that the
    # tick labels and the plotted values always line up one-to-one.
    non_graph_columns = ['Name', 'Flag', 'Age', 'Nationality', 'Photo',
                         'Club_Logo', 'Club']
    categories = [column for column in players.columns
                  if column not in non_graph_columns]
    N = len(categories)

    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]

    ax = plt.subplot(111, projection='polar')
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)
    plt.xticks(angles[:-1], categories, color= 'black', size=17)
    ax.set_rlabel_position(0)
    plt.yticks([25,50,75,100], ["25","50","75","100"], color= basic_color, size= 10)
    plt.ylim(0,100)

    # Read the scores by column name. Filtering by value would drop a score
    # that happens to equal the age and desynchronise values and angles.
    values = players.loc[row, categories].tolist()
    values += values[:1]  # close the polygon
    ax.plot(angles, values, color= basic_color, linewidth=1, linestyle='solid')
    ax.fill(angles, values, color= colorRandom, alpha=0.5)

    # Flag as a semi-transparent background behind the polar axes.
    axes_coords = [0, 0, 1, 1]
    ax_image = plt.gcf().add_axes(axes_coords,zorder= -1)
    ax_image.imshow(img,alpha=0.5)
    ax_image.axis('off')

    ax.annotate('Nationality: ' + nationality.upper(), xy=(10,10), xytext=(103, 138),
                fontsize= 12,
                color = 'white',
                bbox={'facecolor': color_annotate, 'pad': 7})
    ax.annotate('Age: ' + str(age), xy=(10,10), xytext=(43, 180),
                fontsize= 15,
                color = 'white',
                bbox={'facecolor': color_annotate, 'pad': 7})
    ax.annotate('Team: ' + club.upper(), xy=(10,10), xytext=(92, 168),
                fontsize= 12,
                color = 'white',
                bbox={'facecolor': color_annotate, 'pad': 7})

    # Player photo and club logo, placed as offset image boxes.
    arr_img_player = plt.imread(player_image, format='jpg')
    imagebox_player = OffsetImage(arr_img_player)
    imagebox_player.image.axes = ax
    abPlayer = AnnotationBbox(imagebox_player, (0.5, 0.7),
                              xybox=(313, 223),
                              xycoords='data',
                              boxcoords="offset points"
                              )
    arr_img_logo = plt.imread(logo_image, format='jpg')
    imagebox_logo = OffsetImage(arr_img_logo)
    imagebox_logo.image.axes = ax
    abLogo = AnnotationBbox(imagebox_logo, (0.5, 0.7),
                            xybox=(-320, -226),
                            xycoords='data',
                            boxcoords="offset points"
                            )
    ax.add_artist(abPlayer)
    ax.add_artist(abLogo)
    plt.title(title, size=50, color= basic_color)


def _download_image(url, path, timeout=10):
    """Fetch ``url`` and store the response body at ``path``."""
    response = requests.get(url, timeout=timeout)
    # Surface HTTP failures instead of saving an error page as an image.
    response.raise_for_status()
    with open(path, 'wb') as handler:
        handler.write(response.content)
# defining a polar graph
def get_id_card(id = 0):
    """Draw the ID card of the player at position ``id`` in ``players``.

    Args:
        id: Zero-based position of the player in the global ``players``
            frame. Out-of-range values print a hint instead of raising.
    """
    total = len(players)
    if 0 <= id < total:
        # Read every field positionally, consistent with the bounds check
        # and with ``players.index[id]`` below.
        player = players.iloc[id]
        details(row = players.index[id],
                title = player['Name'],
                age = player['Age'],
                photo = player['Photo'],
                nationality = player['Nationality'],
                image = player['Flag'],
                logo = player['Club_Logo'],
                club = player['Club'])
    else:
        # Report the actual size and the last valid position.
        print('The base has %d players. You can put positive numbers from 0 to %d'
              % (total, total - 1))
# Draw an ID card for each of the five players with the highest 'Overall'.
best_footballers = df['Overall'].nlargest(5)
for index in best_footballers.index:
    get_id_card(index)
# For every position, the player with the highest 'Potential'.
df.loc[df.groupby(df['Position'])['Potential'].idxmax()][['Name', 'Position', 'Overall', 'Age', 'Nationality', 'Club']]
#### sns.set(style = 'dark', palette = 'colorblind', color_codes = True)
# Histogram of player wages.
x = df.Wage
plt.figure(figsize = (12, 8))
ax = sns.distplot(x, bins = 50, kde = False, color = 'm')
ax.set_xlabel(xlabel = 'Player Wage', fontsize = 16)
ax.set_ylabel(ylabel = 'Player Count',fontsize = 16)
ax.set_title(label = 'Histogram that shows the wage of the Players', fontsize = 20)
plt.show()
# Players whose wage exceeds 300000.
df[df['Wage']>300000][['Name','Age','Wage']]
# Mean 'Overall' rating per wage value and per age.
df.groupby('Wage')['Overall'].mean().plot()
df.groupby('Age')['Overall'].mean().plot()
# Number of players at each age, and the players older than 40.
sns.countplot(x='Age', data=df)
df[df['Age']>40][['Name','Overall','Age','Nationality']]
# Wage distribution per age group for players whose wage exceeds 10000.
# Take an explicit copy: adding a column to a filtered selection of ``df``
# triggers SettingWithCopyWarning and may not behave as intended.
new_wage = df[df['Wage']>10000].copy()
# Four equal-width age bins.
new_wage['age_group'] = pd.cut(new_wage.Age, bins=4)
#new_wage.plot(x='age_group', y='Wage', kind = 'bar')
ax = new_wage.boxplot(column='Wage', by='age_group', showmeans=True)
ax.set_xlabel(xlabel = 'Age Group', fontsize = 20)
ax.set_ylabel(ylabel = 'Wage', fontsize = 20)
# The two string literals below hold disabled exploratory code; they are kept
# verbatim and are not executed.
"""positions = ['CAM', 'CB', 'CDM', 'CF', 'CM', 'LAM',
'LB', 'LCB', 'LCM', 'LDM', 'LF', 'LM', 'LS', 'LW', 'LWB', 'RAM', 'RB', 'RCB', 'RCM', 'RDM', 'RF',
'RM', 'RS', 'RW', 'RWB']"""
"""for i in positions:
print('\n\n','Top 10', i, 'in FIFA 19', '\n')
temp_df = df[df.Position == i]
print(temp_df.sort_values(i, ascending=False).head(10).reset_index()[['Name', i]])
#print(df.sort_values(temp_df, ascending=False).head(10).reset_index()[['Name', 'Nationality', 'Club', 'Overall']])"""